/* calculate industry wage and employment measures */
/* 1997-2001 SIC 2 digit, 2002 NAICS 4 digit, 2003-2017 NAICS 3 digit */

libname edu '!userprofile\\Dropbox\Education\Replication\Data';
/* change wage to 1997 dollars */
data cpi; set edu.cpi; if year >=1997; cpi = cpi/160.5; run;

proc sql;
  create table blswage_raw as
  select a.*, b.cpi
  from edu.blswage_raw as a, cpi as b
  where a.year = b.year;
quit; 

data blswage_raw (drop = cpi);
  set blswage_raw;
  a_mean = a_mean/cpi;
  if year <= 1998 then naics = trim(substr(naics,1,2));
run;

* get all distinct industry-year;
proc sort data = blswage_raw out = industry_year (keep = naics year) nodupkey; by year naics; quit;

* get "all-industry" wage (occ_code = 00-0000 for 1999-2016 or 00000 for 1997-1998);
proc sql;
  create table industrywage as
  select distinct a.*, b.*
  from industry_year as a left join blswage_raw as b
  on a.year = b.year and a.naics = b.naics and b.occ_code = "00-0000";
quit;

data industrywage2 (keep = year naics);
  set industrywage;
  if occ_code = "";
run;

data industrywage;
  set industrywage;
  if occ_code = "" then delete;
run;

proc sql;
  create table industrywage3 as
  select distinct a.*, b.*
  from industrywage2 as a left join blswage_raw as b
  on a.year = b.year and a.naics = b.naics and b.occ_code = "00000";
quit;

proc append base = industrywage data = industrywage3; run;

* for 2002, we have 4-digit NAICS.  Calculate the weighted-average mean and median (weighted by tot_emp) and sum of tot_emp across each 3-digit NAICS;
data industrywage2002;
  set industrywage;
  if year = 2002;
  naics = substr(naics, 1, 3);
run;

proc sort data = industrywage2002; by naics; run;

proc means data = industrywage2002 noprint;
  by naics;
  var a_mean;
  weight tot_emp;
  output out = industrywage20022 mean = a_mean;
run;

proc means data = industrywage2002 noprint;
  by naics;
  var tot_emp;
  output out = industrywage20023 sum = tot_emp;
run;

data industrywage20022 (keep = year naics a_mean);
  set industrywage20022;
  year = 2002;
run;

proc sql;
  create table industrywage20022 as
  select a.*, b.tot_emp
  from industrywage20022 as a, industrywage20023 as b
  where a.naics = b.naics;
quit;

data industrywage; set industrywage; if year = 2002 then delete; run;
proc append base = industrywage data = industrywage20022 force; run;

* get all "detailed" occ_code (occ_group = detailed or occ_code last digit is not zero after 1999, occ_code last two digits not zero before 1999);
data crosssectionwage;
  set blswage_raw;
  if year >= 1999;
  if occ_group = "detailed" or substr(occ_code,7,1)~="0";
  if year = 2002 then delete; * 2002 data are at 4-digit NAICS level. not comparable;
run;

data crosssectionwage2;
  set blswage_raw;
  if year<=1998;
  if substr(occ_code,4,2)~='00';
run;

proc append base = crosssectionwage data = crosssectionwage2; run;
proc sort data = crosssectionwage; by year naics; run;

/* get the average wage and total employment jobs that require bachelor's degrees and different levels of experience */
/* only look at occ codes that are mapped to cip (major) codes, and majors that correspond to the industry */
/* occ codes in 1997-98 are different, need oessoc map */
/* occ_degree map comes from BLS */
proc sql;
  create table wage as
  select distinct a.*, b.degree, b.experience, b.major
  from crosssectionwage as a, edu.occ_degree as b
  where a.occ_code = b.occcode and a.year >= 2010;
quit;

proc sql;
  create table wage2010 as
  select distinct a.*, b.degree, b.experience, b.major
  from crosssectionwage as a, edu.occ_degree as b
  where a.occ_code = b.occcode2010 and 1998 < a.year < 2010;
quit;

proc sql;
  create table oessoc as
  select distinct a.*, b.degree, b.experience, b.major
  from edu.oessoc as a, edu.occ_degree as b /* OES to SOC map comes from Ohio's Bureau of Labor Market Information */
  where a.soccode = b.occcode2010;
quit;

data oessoc (drop = oescode); set oessoc; if major = . then delete; oescode2 = put(oescode,5.); run;

proc sql;
  create table wage1998 as
  select distinct a.*, b.degree, b.experience, b.major, b.soccode
  from crosssectionwage as a, oessoc as b
  where a.occ_code = b.oescode2 and a.year <= 1998;
quit;

data wage1998 (drop = soccode); set wage1998; occ_code = soccode; naics = trim(substr(naics,1,2)); run;

proc append base = wage data = wage2010; run;
proc append base = wage data = wage1998; run;

/* save this dataset for later */
data wagesoc; set wage; run;

* calculate 2002 separately;
* first calculate weighted average mean and sum of tot_emp for each occ_code within each 3-digit NAICS;
data crosssectionwage2002;
  set blswage_raw;
  if substr(occ_code,7,1)~="0";
  if year = 2002;
  naics = substr(naics, 1, 3);
run;

proc sort data = crosssectionwage2002; by naics occ_code; run;

proc means data = crosssectionwage2002 noprint;
  by naics occ_code;
  var a_mean;
  weight tot_emp;
  output out = occcode2002 mean = a_mean;
run;

proc means data = crosssectionwage2002 noprint;
  by naics occ_code;
  var tot_emp;
  output out = occcode20022 sum = tot_emp;
run;

proc sql;
  create table occcode2002 as
  select a.*, b.tot_emp
  from occcode2002 as a, occcode20022 as b
  where a.naics = b.naics and a.occ_code = b.occ_code;
quit;

/* get the average wage and total employment jobs that require bachelor's degrees and different levels of experience */
/* occ codes in 1997-98 are different, will be dropped in this analysis */
proc sql;
  create table wage2002 as
  select distinct a.*, b.degree, b.experience, b.major
  from occcode2002 as a left join edu.occ_degree as b
  on a.occ_code = b.occcode2010;
quit;

/* save this dataset for later */
data wagesoc2002; set wage2002; run;

/* construct a wage/employment dataset at major level */
data wagesoc2002; set wagesoc2002; year = 2002; run;
data wagesoc1; set wagesoc; run;

proc append base = wagesoc1 data = wagesoc2002 force; run;

data wagesoc1 (keep = naics naics_title occ_code occ_title tot_emp a_mean year major degree experience);
  set wagesoc1;
  if major = . then delete;
run;

proc sort data = wagesoc1; by year major occ_code naics; run;

/* get wage/employment for jobs that require Bachelor's degree and no experience */
data wagesoc_none;
  set wagesoc1;
  if degree = "Bachelor's degree";
  if experience = "None";
run;

/* for occ codes that are mapped to more than three majors, delete */
/* the only code that is deleted is 17-2199: Engineers, All Others */
proc sort data = wagesoc_none out = wageocc nodupkey; by year occ_code major; run;
proc means data = wageocc noprint;
  var year;
  by year occ_code;
  output out=wageocc2 n = year_n;
run;

data wageocc2; set wageocc2; 
if year_n <= 3; 
run;

proc sql;
  create table wagesoc_none as
  select a.*, b.year_n
  from wagesoc_none as a, wageocc2 as b
  where a.year = b.year and a.occ_code = b.occ_code;
quit;

data wagesoc_none; set wagesoc_none; tot_emp = tot_emp/year_n; run;

proc means data = wagesoc_none noprint;
  by year major;
  var a_mean;
  weight tot_emp;
  output out = wagesoc_none1 mean = a_mean;
run;

proc means data = wagesoc_none noprint;
  by year major;
  var tot_emp;
  output out = wagesoc_none2 sum = tot_emp;
run;

/* separate occupation codes based on wage */
/* the idea is to check whether high paid jobs in a major see the same effect */
/* convert sic2 to naics for data before 2002 */
data wagesoc_none_temp (drop = naics);
  set wagesoc_none;
  naics3 = naics + 1 - 1;
  if naics3 >= 100000 then naics3 = naics3/1000;
run;

proc sql;
  create table wagesoc_none_temp2 as
  select distinct a.*, b.naics3 as naics32
  from wagesoc_none_temp as a left join edu.naics_3_sic_2 as b
  on a.naics3 = b.sic2;
quit;

data wagesoc_none_temp2 (drop = naics3 naics32);
  set wagesoc_none_temp2;
  if naics3 < 100 then naics3 = naics32;
  if naics3 = . then delete;
  naics = naics3;
run;

/* first calculate average wage in each major-naics in each year */
proc sort data = wagesoc_none_temp2;
  by year major naics occ_code;
run;

proc means data = wagesoc_none_temp2 noprint;
  var a_mean;
  by year major naics occ_code;
  weight tot_emp;
  output out = major_occ 
  mean = a_wage;
run;

proc means data = wagesoc_none_temp2 noprint;
  var tot_emp;
  by year major naics occ_code;
  output out = major_occ2
  sum = tot_emp;
run;

proc sql;
  create table major_occ as
  select a.*, b.tot_emp
  from major_occ as a, major_occ2 as b
  where a.year = b.year and a.major = b.major and a.naics=b.naics and a.occ_code = b.occ_code;
quit;

/* calculate average wage at t-4 year */
proc sql;
  create table occ_past as
  select distinct a.year, a.major, a.naics, a.occ_code, b.a_wage
  from major_occ as a, major_occ as b
  where a.major = b.major and a.naics = b.naics and a.occ_code = b.occ_code and
  4 <= a.year - b.year <= 4;
quit;

proc sort data = occ_past;
  by year major naics occ_code;
run;

proc means data = occ_past noprint;
  var a_wage;
  by year major naics occ_code;
  output out = occ_past2 
  mean = a_wage;
run;

/* calculate the median wage */
proc sort data = occ_past2; 
  by year major;
run;

proc means data = occ_past2 noprint;
  by year major;
  var a_wage;
  output out = major_median
  median = med_wage;
quit;

proc sql;
  create table occ_past2 as
  select a.*, b.med_wage
  from occ_past2 as a, major_median as b
  where a.year = b.year and a.major = b.major;
quit;

/* calculate two series: one for >= median, one for < median */
data occ_past3;
  set occ_past2;
  if a_wage > med_wage then high = 1;
  if a_wage <= med_wage then high = 0;
run;

proc sql;
  create table major_occ4 as
  select a.*, b.high
  from major_occ as a, occ_past3 as b
  where a.year = b.year and a.major = b.major and a.naics = b.naics and a.occ_code = b.occ_code;
quit;

data major_occ3 (drop = high);
  set major_occ4;
  if high = 1 then do a_wage_high = a_wage; tot_emp_high = tot_emp; end;
  if high = 0 then do a_wage_low = a_wage; tot_emp_low = tot_emp; end;
run;


proc sort data = major_occ3;
  by year major;
run;

proc means data = major_occ3 noprint;
  by year major;
  var a_wage_high;
  weight tot_emp_high;
  output out = wagesoc_none3 mean = a_mean_high;
run;

proc means data = major_occ3 noprint;
  by year major;
  var a_wage_low;
  weight tot_emp_low;
  output out = wagesoc_none4 mean = a_mean_low;
run;

proc means data = major_occ3 noprint;
  by year major;
  var tot_emp_high tot_emp_low;
  output out = wagesoc_none5 sum = tot_emp_high tot_emp_low;
run;

/* higher level (experienced) jobs */
data wagesoc_10;
  set wagesoc1;
  if experience = "5 years or more" or experience = "Less than 5 years" or degree = "Doctoral or professional degree";
run;

proc sql;
  create table wagesoctemp1 as
  select distinct a.*, b.naics_3
  from wagesoc_10 as a, edu.naics_3_major as b
  where a.major = b.major and a.major ~= .;
quit;

proc sql;
  create table wagesoctemp2 as
  select distinct a.*, b.sic_2 as naics_3
  from wagesoc_10 as a, edu.sic_2_major as b
  where a.major = b.major and a.major ~= .;
quit;

proc append base = wagesoctemp1 data = wagesoctemp2; run;

data wagesoc_10 (drop = naics2);
  set wagesoctemp1;
  naics2 = naics + 1 - 1;
  if naics2 = naics_3 or naics2/1000 = naics_3;
run;

proc sort data = wagesoc_10; by year major; run;

proc means data = wagesoc_10 noprint;
  by year major;
  var a_mean;
  weight tot_emp;
  output out = wagesoc_101 mean = a_mean;
run;

proc means data = wagesoc_10 noprint;
  by year major;
  var tot_emp;
  output out = wagesoc_102 sum = tot_emp;
run;

/* get all major-years */
proc sql;
  create table all as
  select distinct year, major
  from wagesoc_none;
quit;

/* merge all data together */
proc sql;
  create table all2 as
  select a.*, log(b.a_mean) as annual_wage
  from all as a left join wagesoc_none1 as b
  on a.year = b.year and a.major = b.major;
quit;

proc sql;
  create table all3 as
  select a.*, log(b.tot_emp) as tot_emp
  from all2 as a left join wagesoc_none2 as b
  on a.year = b.year and a.major = b.major;
quit;

proc sql;
  create table all3 as
  select a.*, log(b.a_mean_high) as annual_wage_high
  from all3 as a left join wagesoc_none3 as b
  on a.year = b.year and a.major = b.major;
quit;

proc sql;
  create table all3 as
  select a.*, log(b.a_mean_low) as annual_wage_low
  from all3 as a left join wagesoc_none4 as b
  on a.year = b.year and a.major = b.major;
quit;

proc sql;
  create table all3 as
  select a.*, log(b.tot_emp_high) as tot_emp_high, log(b.tot_emp_low) as tot_emp_low
  from all3 as a left join wagesoc_none5 as b
  on a.year = b.year and a.major = b.major;
quit;

proc sql;
  create table all4 as
  select a.*, log(b.a_mean) as annual_wage_10
  from all3 as a left join wagesoc_101 as b
  on a.year = b.year and a.major = b.major;
quit;

proc sql;
  create table all5 as
  select a.*, log(b.tot_emp) as tot_emp_10
  from all4 as a left join wagesoc_102 as b
  on a.year = b.year and a.major = b.major;
quit;

proc sort data = all5; by major year; run;

data all6;
  set all5;
  by major year;
  delta_tot_emp = tot_emp - lag(tot_emp);
  delta_tot_emp_high = tot_emp_high - lag(tot_emp_high);
  delta_tot_emp_low = tot_emp_low - lag(tot_emp_low);
  delta_tot_emp_10 = tot_emp_10 - lag(tot_emp_10);
  if first.major then do;
  delta_tot_emp = .;
  delta_tot_emp_high = .;
  delta_tot_emp_low = .;
  delta_tot_emp_10 = .; end;
run;

data edu.wagesoc; set all6; run;
